In [1]:
import pandas as pd
import numpy as np
Read in the probe expression values.
In [2]:
# Load the probe expression values from CSV into a DataFrame.
data = pd.read_csv(
    '/Users/Frankie/Documents/Dissertation/Data/pancreatic/24hProbeExpressionValues.csv'
)
# Preview the first five rows.
data.head(5)
Out[2]:
The columns are the instances and the rows are the features, so we need to transpose the dataset so that each row is an instance.
In [3]:
# Swap rows and columns: the original column labels become the row index.
# NOTE: this rebinds `data`, so re-running the cell transposes it back.
data = data.transpose()
Read in the labels...
In [4]:
# Load the targets CSV into a DataFrame.
label = pd.read_csv(
    '/Users/Frankie/Documents/Dissertation/Data/pancreatic/24hTargets.csv'
)
In [5]:
# Preview the first five rows of the targets table.
label.head(5)
Out[5]:
We are using the OAC labeling, so keep only the FileName and OAC columns.
In [6]:
# Keep only the file identifier and the OAC column; every other column is discarded.
label = label.loc[:, ['FileName', 'OAC']]
# Preview the first five rows of the reduced table.
label.head(5)
Out[6]:
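Join the data and labels on FileName (matching each label's FileName against the index of the transposed data), remove rows that have no expression values, and create a binary label column (0 for 'Mild', 1 otherwise).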
In [7]:
# Attach the expression values to each label row: the FileName column of
# `label` is matched against the index of `data`.
joined_tables = label.join(data, on='FileName', how='outer')
# The outer join keeps unmatched rows from both sides: label rows with no
# expression values (null Probe1) and expression rows with no label (null OAC).
# Drop both kinds; otherwise a missing OAC would fall through the comparison
# below and be silently labelled 1.
joined_tables = joined_tables.dropna(subset=['Probe1', 'OAC'])
# Binary target: 0 when OAC is 'Mild', 1 for every other OAC value.
# .assign returns a new frame, which avoids SettingWithCopyWarning from
# writing a column onto a filtered frame.
joined_tables = joined_tables.assign(
    label=np.where(joined_tables['OAC'] == 'Mild', 0, 1)
)
# Preview the first five rows.
joined_tables[:5]
Out[7]:
Drop the FileName and OAC columns and export as a CSV file.
In [8]:
# Remove the identifier and raw OAC columns, then write the remaining
# columns to CSV without the row index.
(
    joined_tables
    .drop(['FileName', 'OAC'], axis='columns')
    .to_csv("/Users/Frankie/Desktop/pancreatic.csv", index=False)
)